PanMyeloid Assignment–Data Exploration
Data Exploration Notebook
1. Environment Setting
# Single-cell required
library(Seurat)
library(SeuratData)
library(SeuratDisk)
library(SeuratWrappers)
# For data analysis
library(rlang)
library(dplyr)
library(tidyr)
library(stringr)
# For plotting
library(ggplot2)
library(cowplot)
library(ggridges)
library(ggsci)packageVersion("Seurat")## [1] '4.0.4'
2. Import and Examine Data
hfile <- Connect("../processedData/alldata.obj/regressed-alldata.h5seurat")
hfile$index()## Data for assay RNA★ (default assay)
## counts data scale.data
## ✔ ✔ ✔
# Convert("../processedData/alldata.obj/regressed-alldata.h5ad", dest = "h5seurat", overwrite = TRUE)
alldata <- LoadH5Seurat("../processedData/alldata.obj/regressed-alldata.h5seurat",meta.data=F)
alldata.meta <- read.csv("../processedData/alldata.obj/alldata-metadata.csv",header = T, row.names = 1)
alldata@meta.data <- alldata.meta
alldata## An object of class Seurat
## 10390 features across 45251 samples within 1 assay
## Active assay: RNA (10390 features, 0 variable features)
head(alldata@meta.data)alldata@meta.data %>%
ggplot(aes(x=patient, fill=patient)) +
geom_bar() +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) +
theme(plot.title = element_text(hjust=0.5, face="bold"),legend.position="none") +
ggtitle("NCells") * Cell number check
cell_tech <- alldata@meta.data %>%
ggplot(aes(fill=tech_10X, x=tech_10X)) +
geom_bar(alpha = 0.75) + ggtitle("NCells") +
theme_classic() +
scale_fill_npg()
cell_cancer <- alldata@meta.data %>%
ggplot(aes(fill=cancer, x=cancer)) +
geom_bar(alpha = 0.75) + ggtitle("NCells") +
theme_classic() +
scale_fill_simpsons()
plot_grid(cell_tech,cell_cancer,rel_widths = c(1,2))- UMI counts (transcripts) per cell
count_tech <- alldata@meta.data %>%
ggplot(aes(fill=tech_10X, x=n_counts)) + ggtitle("NCounts") +
geom_density(alpha = 0.3) +
scale_x_log10() +
theme_classic() +
scale_fill_npg()
count_cancer <- alldata@meta.data %>%
ggplot(aes(fill=cancer, x=n_counts)) + ggtitle("NCounts") +
geom_density(alpha = 0.3) +
scale_x_log10() +
theme_classic() +
scale_fill_simpsons()
plot_grid(count_tech,count_cancer,rel_widths = c(2,2))3. Dimensionality Reduction
alldata <- FindVariableFeatures(alldata,selection.method = "vst",nfeatures = 2000)
alldata <- RunPCA(alldata, npcs = 100, verbose = FALSE)
ElbowPlot(alldata,ndims = 100)# Set dim
pc.num=1:100# Run UMAP
alldata <- RunUMAP(alldata, dims = pc.num, reduction = "pca")
alldata <- RunTSNE(alldata, dims = pc.num)p1 <- DimPlot(alldata, group.by = "patient", reduction = "umap") + NoLegend()
p2 <- DimPlot(alldata, group.by = "patient", reduction = "tsne") + NoLegend()
plot_grid(p1,p2,rel_widths = c(2,2))p3 <- DimPlot(alldata, group.by = "tech_10X", reduction = "umap")
p4 <- DimPlot(alldata, group.by = "tech_10X", reduction = "tsne")
plot_grid(p3,p4,rel_widths = c(2,2))p5 <- DimPlot(alldata, group.by = "tech", reduction = "umap")
p6 <- DimPlot(alldata, group.by = "tech", reduction = "tsne")
plot_grid(p5,p6,rel_widths = c(2,2))p7 <- DimPlot(alldata, group.by = "MajorCluster", reduction = "umap")
p8 <- DimPlot(alldata, group.by = "MajorCluster", reduction = "tsne")
plot_grid(p7,p8,rel_widths = c(2,2))4. Harmony Integration
library(harmony)
library(Rcpp)
library(rlang)dat.h <- alldataintegrated.by.both.h <- harmony::RunHarmony(dat.h, group.by.vars =c("tech_10X","patient"), assay.use = "RNA", plot_convergence = TRUE)# Run UMAP
integrated.by.both.h <- RunUMAP(integrated.by.both.h, dims = 1:100, reduction = "harmony")DimPlot(integrated.by.both.h, group.by = c("tech_10X", "MajorCluster"), ncol = 2,reduction = "umap")p_cls <- DimPlot(object = integrated.by.both.h, group.by = "MajorCluster", reduction = "umap")
p_patient <- DimPlot(object = integrated.by.both.h, group.by = "patient", reduction = "umap") +NoLegend()
plot_grid(p_patient,p_cls,rel_widths = c(2,2))5. LIGER Integration
library(rliger)dat.l <- alldatadat.l <- ScaleData(dat.l, split.by = "tech_10X", do.center = FALSE)
dat.l <- RunOptimizeALS(dat.l, k = 20, lambda = 5, split.by = "tech_10X",)##
|
| | 0%
|
|== | 3%
|
|===== | 7%
|
|======= | 10%
|
|========= | 13%
|
|============ | 17%
|
|============== | 20%
|
|================ | 23%
|
|=================== | 27%
|
|===================== | 30%
|
|======================= | 33%
|
|========================== | 37%
|
|============================ | 40%
|
|============================== | 43%
|
|================================= | 47%
|
|=================================== | 50%
|
|===================================== | 53%
|
|======================================== | 57%
|
|========================================== | 60%
|
|============================================ | 63%
|
|=============================================== | 67%
|
|================================================= | 70%
|
|=================================================== | 73%
|
|====================================================== | 77%
|
|======================================================== | 80%
|
|========================================================== | 83%
|
|============================================================= | 87%
|
|=============================================================== | 90%
|
|================================================================= | 93%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## Finished in 7.739579 mins, 30 iterations.
## Max iterations set: 30.
## Final objective delta: 1.651939e-05.
## Best results with seed 1.
dat.l <- RunQuantileNorm(dat.l, split.by = "tech_10X")# Dimensional reduction and plotting
dat.l <- RunUMAP(dat.l, dims = 1:ncol(dat.l[["iNMF"]]), reduction = "iNMF")
DimPlot(dat.l, group.by = c("tech_10X", "MajorCluster"), ncol = 2)6. Seurat3 Integration
Posing error: caught segfault, address (nil), cause ‘memory not mapped’
Issue#4628 exists on GitHub with bug label but no responses.
Maybe due to insufficient menory? Anyway, just show the codes below.
dat.s <- alldatasplit_seurat <- SplitObject(dat.s, split.by = "patient")
# Normalize and identify variable features for each dataset independently
split_seurat <- lapply(X = split_seurat, FUN = function(x) {
x <- NormalizeData(x)
x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000)
})
# select features that are repeatedly variable across datasets for integration
integ_features <- SelectIntegrationFeatures(object.list = split_seurat)Perform CCA, find the best buddies or anchors and filter incorrect anchors.
# Find best buddies - can take a while to run
integ_anchors <- FindIntegrationAnchors(object.list = split_seurat,
normalization.method = "LogNormalize",
anchor.features = integ_features,
dims = 1:20, reduction = "rpca")Integrate across conditions.
# Integrate across conditions
integrated.s <- IntegrateData(anchorset = integ_anchors, normalization.method = "LogNormalize")7. LISI Evaluation
# devtools::install_github("immunogenomics/lisi")
library(lisi)
library(Rcpp)7.1 Check Before Integration
umap.co <- Embeddings(object =alldata[["umap"]])
meta.df <- data.frame(patient = alldata$patient, tech = alldata$tech, tech_10X = alldata$tech_10X)umap.res <- compute_lisi(umap.co, meta.df, c("patient", "tech", "tech_10X"))
# save(umap.res,tsne.res,file = "../processedData/tmp.data/ori-lisi-res.RData")summary(umap.res)## patient tech tech_10X
## Min. : 1.000 Min. :1.000 Min. :1.000
## 1st Qu.: 1.294 1st Qu.:1.000 1st Qu.:1.000
## Median : 2.564 Median :1.000 Median :1.025
## Mean : 3.264 Mean :1.006 Mean :1.170
## 3rd Qu.: 4.709 3rd Qu.:1.000 3rd Qu.:1.228
## Max. :13.282 Max. :2.000 Max. :2.607
Each row in the output data frame corresponds to a cell from X. The score (e.g. max 13.282) indicates the effective number of different categories represented in the local neighborhood of each cell. If the cells are well-mixed, then we might expect the LISI score to be near 46 for a categorical variable with 46 categories(e.g. “patient”).
vis_ori <- rbind(data.frame(Score = umap.res$patient, Batch = "Patient", Method = "None"),
data.frame(Score = umap.res$tech, Batch = "Tech", Method = "None"),
data.frame(Score = umap.res$tech_10X, Batch = "Tech_10X", Method = "None"))7.2 Harmony Output
h.both.umap.co <- Embeddings(object =integrated.by.both.h[["umap"]])
h.both.umap.res <- compute_lisi(h.both.umap.co, meta.df, c("patient", "tech", "tech_10X"))summary(h.both.umap.res)## patient tech tech_10X
## Min. : 1.000 Min. :1.000 Min. :1.000
## 1st Qu.: 2.976 1st Qu.:1.000 1st Qu.:1.145
## Median : 5.123 Median :1.002 Median :1.463
## Mean : 5.414 Mean :1.054 Mean :1.520
## 3rd Qu.: 7.510 3rd Qu.:1.039 3rd Qu.:1.851
## Max. :17.281 Max. :2.000 Max. :3.000
vis_h <- rbind(data.frame(Score = h.both.umap.res$patient, Batch = "Patient", Method = "Harmony"),
data.frame(Score = h.both.umap.res$tech, Batch = "Tech", Method = "Harmony"),
data.frame(Score = h.both.umap.res$tech_10X, Batch = "Tech_10X", Method = "Harmony"))7.3 LIGER Output
l.both.umap.co <- Embeddings(object =dat.l[["umap"]])
l.both.umap.res <- compute_lisi(l.both.umap.co, meta.df, c("patient", "tech", "tech_10X"))summary(l.both.umap.res)## patient tech tech_10X
## Min. : 1.000 Min. :1.000 Min. :1.000
## 1st Qu.: 2.418 1st Qu.:1.000 1st Qu.:1.145
## Median : 4.442 Median :1.006 Median :1.448
## Mean : 4.804 Mean :1.060 Mean :1.500
## 3rd Qu.: 6.832 3rd Qu.:1.064 3rd Qu.:1.812
## Max. :17.347 Max. :2.000 Max. :2.992
vis_l <- rbind(data.frame(Score = l.both.umap.res$patient, Batch = "Patient", Method = "LIGER"),
data.frame(Score = l.both.umap.res$tech, Batch = "Tech", Method = "LIGER"),
data.frame(Score = l.both.umap.res$tech_10X, Batch = "Tech_10X", Method = "LIGER"))7.4 Scanorama Output
# Convert("../processedData/alldata.obj/finish-integration-with-umap.h5ad", dest = "h5seurat", overwrite = TRUE)
scanorama <- LoadH5Seurat("../processedData/alldata.obj/finish-integration-with-umap.h5seurat",meta.data=F,reductions = "umap")
scanorama.meta <- read.csv("../processedData/alldata.obj/alldata-metadata.csv",header = T, row.names = 1)
scanorama@meta.data <- scanorama.meta
scanorama## An object of class Seurat
## 10390 features across 45251 samples within 1 assay
## Active assay: RNA (10390 features, 0 variable features)
## 1 dimensional reduction calculated: umap
scanorama.umap.co <- Embeddings(object =scanorama[["umap"]])
scanorama.umap.res <- compute_lisi(scanorama.umap.co, meta.df, c("patient", "tech", "tech_10X"))summary(scanorama.umap.res)## patient tech tech_10X
## Min. : 1.000 Min. :1.000 Min. :1.000
## 1st Qu.: 1.640 1st Qu.:1.000 1st Qu.:1.000
## Median : 2.977 Median :1.000 Median :1.004
## Mean : 3.864 Mean :1.054 Mean :1.169
## 3rd Qu.: 5.692 3rd Qu.:1.000 3rd Qu.:1.161
## Max. :15.565 Max. :2.000 Max. :2.971
vis_sc <- rbind(data.frame(Score = scanorama.umap.res$patient, Batch = "Patient", Method = "Scanorama"),
data.frame(Score = scanorama.umap.res$tech, Batch = "Tech", Method = "Scanorama"),
data.frame(Score = scanorama.umap.res$tech_10X, Batch = "Tech_10X", Method = "Scanorama"))7.5 scGen Output
scgen <- read.csv("../processedData/alldata.obj/scgen-data-umap.csv")
scgen.umap.res <- compute_lisi(scanorama.umap.co, meta.df, c("patient", "tech", "tech_10X"))summary(scgen.umap.res)## patient tech tech_10X
## Min. : 1.000 Min. :1.000 Min. :1.000
## 1st Qu.: 1.640 1st Qu.:1.000 1st Qu.:1.000
## Median : 2.977 Median :1.000 Median :1.004
## Mean : 3.864 Mean :1.054 Mean :1.169
## 3rd Qu.: 5.692 3rd Qu.:1.000 3rd Qu.:1.161
## Max. :15.565 Max. :2.000 Max. :2.971
vis_scgen <- rbind(data.frame(Score = scgen.umap.res$patient, Batch = "Patient", Method = "scGen"),
data.frame(Score = scgen.umap.res$tech, Batch = "Tech", Method = "scGen"),
data.frame(Score = scgen.umap.res$tech_10X, Batch = "Tech_10X", Method = "scGen"))7.6 Visualization
vis_all <- rbind(vis_ori,vis_h,vis_l,vis_sc,vis_scgen)ggplot(vis_all, aes(x=Method, y=Score, fill=Method)) +
geom_violin(position="dodge") +
geom_boxplot(width=0.1, color="gray30", alpha=0.3, outlier.shape = 21,outlier.colour="transparent") +
facet_wrap(~Batch, scale="free") +
xlab("") + ylab("log10(Score)") +
scale_fill_simpsons(alpha = 0.8) + scale_y_log10()8. Supplemental Information
▶ Session Info
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.3 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=zh_CN.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=zh_CN.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=zh_CN.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=zh_CN.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] lisi_1.0 rliger_1.0.0
## [3] patchwork_1.1.1 Matrix_1.3-4
## [5] harmony_0.1.0 Rcpp_1.0.7
## [7] ggsci_2.9 ggridges_0.5.3
## [9] cowplot_1.1.1 ggplot2_3.3.5
## [11] stringr_1.4.0 tidyr_1.1.4
## [13] dplyr_1.0.7 rlang_0.4.11
## [15] SeuratWrappers_0.3.0 SeuratDisk_0.0.0.9019
## [17] stxBrain.SeuratData_0.1.1 SeuratData_0.2.1
## [19] SeuratObject_4.0.2 Seurat_4.0.4
##
## loaded via a namespace (and not attached):
## [1] plyr_1.8.6 igraph_1.2.6 lazyeval_0.2.2
## [4] splines_4.1.1 listenv_0.8.0 scattermore_0.7
## [7] digest_0.6.28 foreach_1.5.1 htmltools_0.5.2
## [10] fansi_0.5.0 magrittr_2.0.1 tensor_1.5
## [13] cluster_2.1.2 doParallel_1.0.16 ROCR_1.0-11
## [16] remotes_2.4.1 globals_0.14.0 matrixStats_0.61.0
## [19] R.utils_2.11.0 spatstat.sparse_2.0-0 rmdformats_1.0.2
## [22] colorspace_2.0-2 rappdirs_0.3.3 ggrepel_0.9.1
## [25] xfun_0.26 crayon_1.4.1 jsonlite_1.7.2
## [28] spatstat.data_2.1-0 iterators_1.0.13 survival_3.2-13
## [31] zoo_1.8-9 glue_1.4.2 polyclip_1.10-0
## [34] gtable_0.3.0 leiden_0.3.9 future.apply_1.8.1
## [37] abind_1.4-5 scales_1.1.1 DBI_1.1.1
## [40] miniUI_0.1.1.1 riverplot_0.10 viridisLite_0.4.0
## [43] xtable_1.8-4 reticulate_1.22 spatstat.core_2.3-0
## [46] mclust_5.4.7 bit_4.0.4 rsvd_1.0.5
## [49] htmlwidgets_1.5.4 httr_1.4.2 FNN_1.1.3
## [52] RColorBrewer_1.1-2 ellipsis_0.3.2 ica_1.0-2
## [55] pkgconfig_2.0.3 R.methodsS3_1.8.1 farver_2.1.0
## [58] sass_0.4.0 uwot_0.1.10 deldir_0.2-10
## [61] utf8_1.2.2 tidyselect_1.1.1 labeling_0.4.2
## [64] reshape2_1.4.4 later_1.3.0 munsell_0.5.0
## [67] tools_4.1.1 cli_3.0.1 generics_0.1.0
## [70] evaluate_0.14 fastmap_1.1.0 yaml_2.2.1
## [73] goftest_1.2-2 knitr_1.36 bit64_4.0.5
## [76] fitdistrplus_1.1-6 purrr_0.3.4 RANN_2.6.1
## [79] pbapply_1.5-0 future_1.22.1 nlme_3.1-153
## [82] mime_0.12 R.oo_1.24.0 hdf5r_1.3.4
## [85] compiler_4.1.1 plotly_4.9.4.1 png_0.1-7
## [88] spatstat.utils_2.2-0 tibble_3.1.4 bslib_0.3.0
## [91] stringi_1.7.4 highr_0.9 RSpectra_0.16-0
## [94] lattice_0.20-45 vctrs_0.3.8 pillar_1.6.3
## [97] lifecycle_1.0.1 BiocManager_1.30.15 spatstat.geom_2.2-2
## [100] lmtest_0.9-38 jquerylib_0.1.4 RcppAnnoy_0.0.19
## [103] data.table_1.14.2 irlba_2.3.3 httpuv_1.6.3
## [106] R6_2.5.1 bookdown_0.24 promises_1.2.0.1
## [109] KernSmooth_2.23-20 gridExtra_2.3 parallelly_1.28.1
## [112] codetools_0.2-18 MASS_7.3-54 assertthat_0.2.1
## [115] withr_2.4.2 sctransform_0.3.2 mgcv_1.8-37
## [118] parallel_4.1.1 grid_4.1.1 rpart_4.1-15
## [121] rmarkdown_2.11 Rtsne_0.15 shiny_1.7.1
▶ Server Info
- Linux Version
## NAME="Ubuntu"
## VERSION="20.04.3 LTS (Focal Fossa)"
## ID=ubuntu
## ID_LIKE=debian
## PRETTY_NAME="Ubuntu 20.04.3 LTS"
## VERSION_ID="20.04"
## HOME_URL="https://www.ubuntu.com/"
## SUPPORT_URL="https://help.ubuntu.com/"
## BUG_REPORT_URL="https://bugs.launchpad.net/ubuntu/"
## PRIVACY_POLICY_URL="https://www.ubuntu.com/legal/terms-and-policies/privacy-policy"
## VERSION_CODENAME=focal
## UBUNTU_CODENAME=focal
- CPU info
## Architecture: x86_64
## CPU op-mode(s): 32-bit, 64-bit
## Byte Order: Little Endian
## Address sizes: 46 bits physical, 48 bits virtual
## CPU(s): 96
## On-line CPU(s) list: 0-95
## Thread(s) per core: 2
## Core(s) per socket: 24
## Socket(s): 2
## NUMA node(s): 2
## Vendor ID: GenuineIntel
## CPU family: 6
## Model: 85
## Model name: Intel(R) Xeon(R) Platinum 8259L CPU @ 2.50GHz
## Stepping: 6
## CPU MHz: 2500.000
## CPU max MHz: 3500.0000
## CPU min MHz: 1200.0000
## BogoMIPS: 5000.00
## Virtualization: VT-x
## L1d cache: 1.5 MiB
## L1i cache: 1.5 MiB
## L2 cache: 48 MiB
## L3 cache: 66 MiB
## NUMA node0 CPU(s): 0-23,48-71